{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 决策树"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 决策树的构造"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 信息增益"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from math import log"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "香农熵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def calcShannonEnt(dataSet):\n",
    "    numEntries = len(dataSet)\n",
    "    labelCounts = {}\n",
    "    \n",
    "    for featVec in dataSet:\n",
    "        currentLabel = featVec[-1]\n",
    "        if currentLabel not in labelCounts.keys():\n",
    "            labelCounts[currentLabel] = 0\n",
    "        labelCounts[currentLabel] +=1\n",
    "    shannonEnt = 0.0 \n",
    "    for key in labelCounts:\n",
    "        prop = float(labelCounts[key]) / numEntries\n",
    "        shannonEnt -= prop * log(prop,2)\n",
    "    \n",
    "    return shannonEnt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def createDataSet():\n",
    "    dataSet = [\n",
    "        [1,1,'yes'],\n",
    "        [1,1,'yes'],\n",
    "        [1,0,'no'],\n",
    "        [0,1,'no'],\n",
    "        [0,1,'no']\n",
    "    ]\n",
    "    labels = ['no surfacing', 'flippers']\n",
    "    return dataSet, labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 划分数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def splitDataSet(dataSet, axis, value):\n",
    "    retDataSet=[]\n",
    "    for featVec in dataSet:\n",
    "        if featVec[axis] == value:\n",
    "            reducedFeatVec = featVec[:axis]                #提取axis前的部分\n",
    "            reducedFeatVec.extend(featVec[axis+1:])        #提取axis后的部分\n",
    "            retDataSet.append(reducedFeatVec)\n",
    "    return retDataSet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "myDat, labels = createDataSet()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[1, 'no'], [1, 'no']]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "splitDataSet(myDat,0,0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 盐\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `set()`\n",
    "- 从一个迭代器创造一个`set`\n",
    "- `set`中不允许重复，只保留唯一的值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def chooseBestFeatureToSplit(dataSet):\n",
    "    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
